pacman::p_load(jsonlite,tidygraph,ggraph,visNetwork,graphlayouts,ggforce,skimr,
tidytext,tidyverse,ggstatsplot,ggiraph)Take-home Exercise 3
Getting Started
Background
FishEye analysts have attempted to use traditional node-link visualizations and standard graph analyses, but these were found to be ineffective because the scale and detail in the data can obscure a business’s true structure. Can you help FishEye develop a new visual analytics approach to better understand fishing business anomalies?
Data Wrangling
Importing R pacakges
Importing JSON file by using jsonlite packages
mc3_data <- fromJSON("data/MC3.json")Extracting edges
mc3_edges <- as_tibble(mc3_data$links) %>%
distinct() %>%
mutate(source = as.character(source),
target = as.character(target),
type = as.character(type)) %>%
group_by(source, target, type) %>%
summarise(weights=n()) %>%
filter(source!=target) %>%
ungroup()Extracting nodes
mc3_nodes <- as_tibble(mc3_data$nodes) %>%
mutate(country = as.character(country),
id = as.character(id),
product_services = as.character(product_services),
revenue_omu = as.numeric(as.character(revenue_omu)),
type = as.character(type)) %>%
select(id, country, type, revenue_omu, product_services)Exploring the edges data frame
Show the code
skim(mc3_edges)| Name | mc3_edges |
| Number of rows | 24036 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| source | 0 | 1 | 6 | 700 | 0 | 12856 | 0 |
| target | 0 | 1 | 6 | 28 | 0 | 21265 | 0 |
| type | 0 | 1 | 16 | 16 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| weights | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ |
Edge table
Show the code
DT::datatable(mc3_edges)Plotting by type
Show the code
ggplot(data = mc3_edges,
aes(x = type)) +
geom_bar()
Exploring the nodes data frame
Show the code
skim(mc3_nodes)| Name | mc3_nodes |
| Number of rows | 27622 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 6 | 64 | 0 | 22929 | 0 |
| country | 0 | 1 | 2 | 15 | 0 | 100 | 0 |
| type | 0 | 1 | 7 | 16 | 0 | 3 | 0 |
| product_services | 0 | 1 | 4 | 1737 | 0 | 3244 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| revenue_omu | 21515 | 0.22 | 1822155 | 18184433 | 3652.23 | 7676.36 | 16210.68 | 48327.66 | 310612303 | ▇▁▁▁▁ |
Node table
Show the code
DT::datatable(mc3_nodes)Plotting by type
Show the code
ggplot(data = mc3_nodes,
aes(x = type)) +
geom_bar()
Visualisation and Analysis
Building network model
id1 <- mc3_edges %>%
select(source) %>%
rename(id = source)
id2 <- mc3_edges %>%
select(target) %>%
rename(id = target)
mc3_nodes1 <- rbind(id1, id2) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop")mc3_graph <- tbl_graph(nodes = mc3_nodes1,
edges = mc3_edges,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
mc3_graph %>%
filter(betweenness_centrality >= 100000) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha=0.5)) +
geom_node_point(aes(
size = betweenness_centrality,
colors = "lightblue",
alpha = 0.5)) +
scale_size_continuous(range=c(1,10))+
theme_graph()
Text Sensing
Simple word count
mc3_nodes %>%
mutate(n_fish = str_count(product_services, "fish")) # A tibble: 27,622 × 6
id country type revenue_omu product_services n_fish
<chr> <chr> <chr> <dbl> <chr> <int>
1 Jones LLC ZH Comp… 310612303. Automobiles 0
2 Coleman, Hall and Lopez ZH Comp… 162734684. Passenger cars,… 0
3 Aqua Advancements Sashimi … Oceanus Comp… 115004667. Holding firm wh… 0
4 Makumba Ltd. Liability Co Utopor… Comp… 90986413. Car service, ca… 0
5 Taylor, Taylor and Farrell ZH Comp… 81466667. Fully electric … 0
6 Harmon, Edwards and Bates ZH Comp… 75070435. Discount superm… 0
7 Punjab s Marine conservati… Riodel… Comp… 72167572. Beef, pork, chi… 0
8 Assam Limited Liability … Utopor… Comp… 72162317. Power and Gas s… 0
9 Ianira Starfish Sagl Import Rio Is… Comp… 68832979. Light commercia… 0
10 Moran, Lewis and Jimenez ZH Comp… 65592906. Automobiles, tr… 0
# ℹ 27,612 more rows
Tokenisation
token_nodes <- mc3_nodes %>%
unnest_tokens(word,
product_services)
token_nodes %>%
count(word, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "Count",
y = "Unique words",
title = "Count of unique words found in product_services field")
Removing stopwords
select nodes with “seafood”,“fish”,“carp”,“catfish”,“herring”,“mackerel”,“pollock”,“salmon”,“shark”,“tuna” as part of the product service
stopwords_removed <- token_nodes %>%
anti_join(stop_words) %>%
filter(word %in% c("seafood","fish","carp","catfish","herring","mackerel","pollock","salmon","shark","tuna")) %>%
distinct()
stopwords_removed %>%
count(word, sort = TRUE) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
labs(x = "Count",
y = "Unique words",
title = "Count of unique words found in product_services field")
Plotting by type
Show the code
ggplot(data = stopwords_removed,
aes(x = type)) +
geom_bar()+
geom_text(stat="count",
aes(label=paste0(after_stat(count))),vjust=-1)+
ylim(0,1500)
Analyze company type
Since from the above, the number of company type is much greater than the other 2 types, we will focus on the company type and find the distribution of revenue_omu
Show the code
clean_nodes_c <-stopwords_removed %>%
drop_na(revenue_omu) %>%
filter(type=="Company")
set.seed(1234)
gghistostats(
data = clean_nodes_c,
x = revenue_omu,
type = "bayes",
test.value = 60,
xlab = "revenue_omu"
)
Most of the companies have a revenue_omu within the first bar, but there are some companies that have far more revenue than others, we select the revenue_omu>400,000
Show the code
df_nodes <- clean_nodes_c %>%
filter(revenue_omu>200000)
df_edges <- mc3_edges %>%
filter(source %in% df_nodes$id)
id3 <- df_edges %>%
select(source) %>%
rename(id = source)
id4 <- df_edges %>%
select(target) %>%
rename(id = target)
df_nodes_1 <- rbind(id3, id4) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop")
df_graph <- tbl_graph(nodes = df_nodes_1,
edges = df_edges,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
g <- df_graph %>%
mutate(betweenness_centrality = centrality_betweenness()) %>%
ggraph(layout = "kk") +
geom_edge_link(aes(width=weights),
alpha=0.2) +
scale_edge_width(range = c(0.01, 0.1)) +
geom_node_point(aes(colour = country,
size=betweenness_centrality))
g + theme_graph()
The companies in country ZH seems to have a high revenue_omu but they dont have many business partner records
Grouping
Calculate partner numbers (numbers of targets of a source), and assign partner = -1 if targets dont have a partner record, we only select those with a partner and group them by revenue_omu and partner numbers.
Show the code
df_edges_1 <- mc3_edges %>%
filter(source %in% clean_nodes_c$id)
df_edges_1r <- df_edges_1 %>%
group_by(source) %>%
summarize(partners=n_distinct(target)) %>%
rename(id=source) %>%
ungroup()
df_nodes_2 <- clean_nodes_c %>%
left_join(df_edges_1r) %>%
distinct()
df_nodes_2$partners[is.na(df_nodes_2$partners)] <- -1
df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 1
df_nodes_2$group[(df_nodes_2$partners>quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 2
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 3
df_nodes_2$group[(df_nodes_2$partners<=quantile(df_nodes_2$partners, 0.5)) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 4
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu<=quantile(df_nodes_2$revenue_omu, 0.8))] <- 5
df_nodes_2$group[(df_nodes_2$partners==-1) & (df_nodes_2$revenue_omu>quantile(df_nodes_2$revenue_omu, 0.8))] <- 6
df_nodes_2 <- df_nodes_2[,!names(df_nodes_2) %in%
c("word")] %>%
distinct()
set.seed(1234)
gghistostats(
data = df_nodes_2[df_nodes_2$partners>0,],
x = partners,
type = "bayes",
test.value = 60,
xlab = "partners")
Here we define Group 1 as No. of partners > 50% and revenue <= 80%
Group 2 as No. of partners > 50% and revenue > 80%
Group 3 as No. of partners <= 50% and revenue <= 80%
Group 4 as No. of partners <= 50% and revenue > 80%
Group 5 and 6 are groups with no partners, but revenue less than or equal to 80% and revenue greater than 80%
We then visualize the nodes and edges and since Group 5 and 6 dont have partners, they will not appear in the network. Group 4 is selected since they dont have many partners in business but they have high revenue_omu
Show the code
df_edges_2 <- mc3_edges %>%
filter(source %in% df_nodes_2[df_nodes_2$group==4,]$id)
id5 <- df_edges_2 %>%
select(source) %>%
rename(id = source)
id6 <- df_edges_2 %>%
select(target) %>%
rename(id = target)
df_nodes_3 <- rbind(id5, id6) %>%
distinct() %>%
left_join(mc3_nodes,
unmatched = "drop") %>%
left_join(df_nodes_2)
df_graph_3 <- tbl_graph(nodes = df_nodes_3,
edges = df_edges_2,
directed = FALSE) %>%
mutate(betweenness_centrality = centrality_betweenness(),
closeness_centrality = centrality_closeness())
g_3 <- df_graph_3 %>%
mutate(betweenness_centrality = centrality_betweenness()) %>%
ggraph(layout = "kk") +
geom_edge_link(aes(width=weights),
alpha=0.2) +
scale_edge_width(range = c(0.01, 0.1)) +
geom_node_text(aes(label = ifelse(group > 1, as.character(id), "")), size = 2)+
geom_node_point(aes(colour = group,
size=betweenness_centrality))
g_3 + theme_graph()